In Class Exercise 2

Published

February 13, 2023

Install Packages

Tip

Packages are important. Try to use SF, not SP

Show the code
pacman::p_load(sf, tidyverse, funModeling, tmap)

Loading the file (Convert to the projection)

geoNGA <- st_read("data/geospatial/nigeria_nga_l2/", layer="geoBoundaries-NGA-ADM2") %>% 
  st_transform(crs=26392)
Reading layer `geoBoundaries-NGA-ADM2' from data source 
  `D:\Users\Wen Yang\Documents\wyt05\IS415-GAA-WY\lessons\lesson2\data\geospatial\nigeria_nga_l2' 
  using driver `ESRI Shapefile'
Simple feature collection with 774 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 2.668534 ymin: 4.273007 xmax: 14.67882 ymax: 13.89442
Geodetic CRS:  WGS 84

The below file is the same as the above.

Except this has more information

NGA <- st_read("data/geospatial/nigeria_nga_l2/", layer="nga_admbnda_adm2_osgof_20190417") %>% 
  st_transform(crs=26392)
Reading layer `nga_admbnda_adm2_osgof_20190417' from data source 
  `D:\Users\Wen Yang\Documents\wyt05\IS415-GAA-WY\lessons\lesson2\data\geospatial\nigeria_nga_l2' 
  using driver `ESRI Shapefile'
Simple feature collection with 774 features and 16 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 2.668534 ymin: 4.273007 xmax: 14.67882 ymax: 13.89442
Geodetic CRS:  WGS 84

We pick the second one as it gives the state and LGA (local government area) boundary

Water point data for Nigeria

wp_nga <- read_csv("data/aspatial/water_point_data_exchange/wpdx.csv") %>% filter(`#clean_country_name`=="Nigeria")

Convert water point data into sf point features

Note

You can take the latitude degree and longitude degress

wp_nga$Geometry <- st_as_sfc(wp_nga$`New Georeferenced Column`)
wp_nga
# A tibble: 95,008 × 71
   row_id `#source`      #lat_…¹ #lon_…² #repo…³ #stat…⁴ #wate…⁵ #wate…⁶ #wate…⁷
    <dbl> <chr>            <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>   <chr>  
 1 429068 GRID3             7.98    5.12 08/29/… Unknown <NA>    <NA>    Tapsta…
 2 222071 Federal Minis…    6.96    3.60 08/16/… Yes     Boreho… Well    Mechan…
 3 160612 WaterAid          6.49    7.93 12/04/… Yes     Boreho… Well    Hand P…
 4 160669 WaterAid          6.73    7.65 12/04/… Yes     Boreho… Well    <NA>   
 5 160642 WaterAid          6.78    7.66 12/04/… Yes     Boreho… Well    Hand P…
 6 160628 WaterAid          6.96    7.78 12/04/… Yes     Boreho… Well    Hand P…
 7 160632 WaterAid          7.02    7.84 12/04/… Yes     Boreho… Well    Hand P…
 8 642747 Living Water …    7.33    8.98 10/03/… Yes     Boreho… Well    Mechan…
 9 642456 Living Water …    7.17    9.11 10/03/… Yes     Boreho… Well    Hand P…
10 641347 Living Water …    7.20    9.22 03/28/… Yes     Boreho… Well    Hand P…
# … with 94,998 more rows, 62 more variables: `#water_tech_category` <chr>,
#   `#facility_type` <chr>, `#clean_country_name` <chr>, `#clean_adm1` <chr>,
#   `#clean_adm2` <chr>, `#clean_adm3` <chr>, `#clean_adm4` <chr>,
#   `#install_year` <dbl>, `#installer` <chr>, `#rehab_year` <lgl>,
#   `#rehabilitator` <lgl>, `#management_clean` <chr>, `#status_clean` <chr>,
#   `#pay` <chr>, `#fecal_coliform_presence` <chr>,
#   `#fecal_coliform_value` <dbl>, `#subjective_quality` <chr>, …

Convert tibble dataframe into SF object

Note

Need to convert Aspatial into Geospatial, but they do not have projection.

So need to tell R what is the projection in Aspatial (if it is wgs84, reconvert to that) -> then transform from 26392

wp_sf <- st_sf(wp_nga, crs=4326)
wp_sf
Simple feature collection with 95008 features and 70 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 2.707441 ymin: 4.301812 xmax: 14.21828 ymax: 13.86568
Geodetic CRS:  WGS 84
# A tibble: 95,008 × 71
   row_id `#source`      #lat_…¹ #lon_…² #repo…³ #stat…⁴ #wate…⁵ #wate…⁶ #wate…⁷
 *  <dbl> <chr>            <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>   <chr>  
 1 429068 GRID3             7.98    5.12 08/29/… Unknown <NA>    <NA>    Tapsta…
 2 222071 Federal Minis…    6.96    3.60 08/16/… Yes     Boreho… Well    Mechan…
 3 160612 WaterAid          6.49    7.93 12/04/… Yes     Boreho… Well    Hand P…
 4 160669 WaterAid          6.73    7.65 12/04/… Yes     Boreho… Well    <NA>   
 5 160642 WaterAid          6.78    7.66 12/04/… Yes     Boreho… Well    Hand P…
 6 160628 WaterAid          6.96    7.78 12/04/… Yes     Boreho… Well    Hand P…
 7 160632 WaterAid          7.02    7.84 12/04/… Yes     Boreho… Well    Hand P…
 8 642747 Living Water …    7.33    8.98 10/03/… Yes     Boreho… Well    Mechan…
 9 642456 Living Water …    7.17    9.11 10/03/… Yes     Boreho… Well    Hand P…
10 641347 Living Water …    7.20    9.22 03/28/… Yes     Boreho… Well    Hand P…
# … with 94,998 more rows, 62 more variables: `#water_tech_category` <chr>,
#   `#facility_type` <chr>, `#clean_country_name` <chr>, `#clean_adm1` <chr>,
#   `#clean_adm2` <chr>, `#clean_adm3` <chr>, `#clean_adm4` <chr>,
#   `#install_year` <dbl>, `#installer` <chr>, `#rehab_year` <lgl>,
#   `#rehabilitator` <lgl>, `#management_clean` <chr>, `#status_clean` <chr>,
#   `#pay` <chr>, `#fecal_coliform_presence` <chr>,
#   `#fecal_coliform_value` <dbl>, `#subjective_quality` <chr>, …

Convert to the Nigeria projection system

wp_sf <- wp_sf %>% st_transform(crs=26392)
wp_sf
Simple feature collection with 95008 features and 70 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 28907.91 ymin: 33736.93 xmax: 1293293 ymax: 1092883
Projected CRS: Minna / Nigeria Mid Belt
# A tibble: 95,008 × 71
   row_id `#source`      #lat_…¹ #lon_…² #repo…³ #stat…⁴ #wate…⁵ #wate…⁶ #wate…⁷
 *  <dbl> <chr>            <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>   <chr>  
 1 429068 GRID3             7.98    5.12 08/29/… Unknown <NA>    <NA>    Tapsta…
 2 222071 Federal Minis…    6.96    3.60 08/16/… Yes     Boreho… Well    Mechan…
 3 160612 WaterAid          6.49    7.93 12/04/… Yes     Boreho… Well    Hand P…
 4 160669 WaterAid          6.73    7.65 12/04/… Yes     Boreho… Well    <NA>   
 5 160642 WaterAid          6.78    7.66 12/04/… Yes     Boreho… Well    Hand P…
 6 160628 WaterAid          6.96    7.78 12/04/… Yes     Boreho… Well    Hand P…
 7 160632 WaterAid          7.02    7.84 12/04/… Yes     Boreho… Well    Hand P…
 8 642747 Living Water …    7.33    8.98 10/03/… Yes     Boreho… Well    Mechan…
 9 642456 Living Water …    7.17    9.11 10/03/… Yes     Boreho… Well    Hand P…
10 641347 Living Water …    7.20    9.22 03/28/… Yes     Boreho… Well    Hand P…
# … with 94,998 more rows, 62 more variables: `#water_tech_category` <chr>,
#   `#facility_type` <chr>, `#clean_country_name` <chr>, `#clean_adm1` <chr>,
#   `#clean_adm2` <chr>, `#clean_adm3` <chr>, `#clean_adm4` <chr>,
#   `#install_year` <dbl>, `#installer` <chr>, `#rehab_year` <lgl>,
#   `#rehabilitator` <lgl>, `#management_clean` <chr>, `#status_clean` <chr>,
#   `#pay` <chr>, `#fecal_coliform_presence` <chr>,
#   `#fecal_coliform_value` <dbl>, `#subjective_quality` <chr>, …

Excluding Redundant fields

The code below takes only the relevant column (column 3,4,8,9). As we only need them

NGA <- NGA %>% 
  select(c(3:4, 8:9))

Checking for duplicate name

Check for the quality of the data (data duplication, spatial data we want to check for missing value)

In this case, we have a lot of duplicated fields

NGA$ADM2_EN[duplicated(NGA$ADM2_EN)==TRUE]
[1] "Bassa"    "Ifelodun" "Irepodun" "Nasarawa" "Obi"      "Surulere"

The reason why is because there are 6 LGAs with the same name, but are in different states.

Append the state to ensure no duplicatation

NGA$ADM2_EN[94] <- "Bassa, Kogi"
NGA$ADM2_EN[95] <- "Bassa, Plateau"
NGA$ADM2_EN[304] <- "Ifelodun, Kwara"
NGA$ADM2_EN[305] <- "Ifelodun, Osun"
NGA$ADM2_EN[355] <- "Irepodun, Kwara"
NGA$ADM2_EN[356] <- "Irepodun, Osun"
NGA$ADM2_EN[519] <- "Nasawara, Kano"
NGA$ADM2_EN[520] <- "Nasawara, Nasawara"
NGA$ADM2_EN[546] <- "Obi, Benue"
NGA$ADM2_EN[547] <- "Obi, Nasawara"
NGA$ADM2_EN[693] <- "Surulere, Lagos"
NGA$ADM2_EN[694] <- "Surulere, Oyo"
freq(data = wp_sf, input = "#status_clean")

                     #status_clean frequency percentage cumulative_perc
1                       Functional     45883      48.29           48.29
2                   Non-Functional     29385      30.93           79.22
3                             <NA>     10656      11.22           90.44
4      Functional but needs repair      4579       4.82           95.26
5 Non-Functional due to dry season      2403       2.53           97.79
6        Functional but not in use      1686       1.77           99.56
7         Abandoned/Decommissioned       234       0.25           99.81
8                        Abandoned       175       0.18           99.99
9 Non functional due to dry season         7       0.01          100.00

Mutate allows us to do data processing, replacing #status_clean to remove the ‘#’

We replace all na fields with ‘unknown’

wp_sf_nga <- wp_sf %>%
  rename(status_clean = "#status_clean") %>%
  select(status_clean) %>%
  mutate(status_clean = replace_na(status_clean, "unknown"))

SF is a simple feature object, it will always have a geometric field, even though we only select one column which is ‘status_clean’

Group them by:

Functional

wp_functional <- wp_sf_nga %>%
  filter(status_clean %in% c("Functional", 
                             "Functional but not in use", 
                             "Functional but needs repair"))

Non-functional

wp_nonfunctional <- wp_sf_nga %>%
  filter(status_clean %in% c("Abandoned/Decommissioned", 
                             "Abandoned", 
                             "Non-Functional due to dry season",
                             "Non-Functional",
                             "Non functional due to dry season"))

Unknown

wp_unknown <- wp_sf_nga %>%
  filter(status_clean %in% c("unknown"))

Append the water point count into the new dataframe

Note

The code below tells us how many water point intersects each LGA. (functional, nonfunctional, unknown) then append that information into the original NGA dataframe by creating a new dataframe NGA_wp

NGA_wp <- NGA %>%
  mutate(`total_wp` = lengths(
    st_intersects(NGA, wp_sf_nga))) %>%
  mutate(`wp_functional` = lengths(
    st_intersects(NGA, wp_functional))) %>%
  mutate(`wp_nonfunctional` = lengths(
    st_intersects(NGA, wp_nonfunctional))) %>%
  mutate(`wp_unknown` = lengths(
    st_intersects(NGA, wp_unknown)))

Plot the distribution of water point

ggplot(data = NGA_wp, aes(x = total_wp)) +
  geom_histogram(bins=20, color="black", fill="light blue") +
  geom_vline(aes(xintercept=mean(total_wp,na.rm=T)), color="red",linetype="dashed", size=0.8) +
  ggtitle("Distribution of total water points by LGA") +
  xlab("No. of water points") + 
  ylab("No. of\nLGAs") +
  theme(axis.title.y=element_text(angle=0))

tmap_mode('view') +
  tm_shape(wp_sf_nga) +
  tm_dots(col="status_clean",
          size=0.01,
          border.lwd=0.5) + 
  tm_view(set.zoom.limits = c(9,16))

Saving the analytic data in rds format

write_rds(NGA_wp, "data/rds/NGA_wp.rds")